**************************************************
* CREATE VIOLENT CRIME FILE FOR SPATIAL JOIN
**************************************************

clear
use "crime-chicago-2001-2022-spatially-joined.dta", clear

* 1. Restrict to 2011–2020 and rename type
keep inrange(year, 2011, 2020)
rename primarytype crimetype

* 2. Keep only Assault and Robbery 
keep latitude longitude casenumber date year crimetype
drop if crimetype == "HOMICIDE"
keep if inlist(crimetype, "ASSAULT", "ROBBERY")

tempfile crime_base
save `crime_base', replace


**************************************************
* APPEND SHOOTINGS & HOMICIDES WITH VICTIM INFO
**************************************************

import delimited "homicide-shootings-2004-2023.csv", clear
rename sex       vict_sex
rename race      vict_race
rename case_number casenumber
rename incident_primary crimetype

* 3. Classify non‑homicides as shootings
replace crimetype = "SHOOTING" if crimetype != "HOMICIDE"

* 4. Restrict to 2011–2020 and keep key vars
gen year = real(substr(date, -4, 4))
keep inrange(year, 2011, 2020) latitude longitude casenumber date crimetype vict_sex vict_race

* 5. Combine with Assault/Robbery base
append using `crime_base'
duplicates drop casenumber

tempfile allcrime
save `allcrime', replace


**************************************************
* MERGE PERPETRATOR RACE FROM ARREST RECORDS
**************************************************

import delimited "arrests-records.csv", clear
rename race perp_race
keep casenumber arrestdate perp_race
drop if missing(casenumber)

* 6. Restrict to 2011–2020
gen year = real(substr(arrestdate, -4, 4))
keep inrange(year, 2011, 2020)

* 7. Standardize perp_race categories
replace perp_race = "blk" if inlist(perp_race, "BLACK", "BLACK HISPANIC")
replace perp_race = "hsp" if perp_race == "WHITE HISPANIC"
replace perp_race = "wht" if perp_race == "WHITE"
replace perp_race = "oth" if inlist(perp_race, "ASIAN / PACIFIC ISLANDER", "AMER INDIAN / ALASKAN NATIVE")
replace perp_race = "unk" if perp_race == "UNKNOWN / REFUSED"

* 8. Compute first & last arrest dates and arrest count
gen arrestdate_num = date(substr(arrestdate,1,10), "MDY")
format arrestdate_num %td
bysort casenumber (arrestdate_num): egen arrestdate_first = min(arrestdate_num)
bysort casenumber (arrestdate_num): egen arrestdate_last  = max(arrestdate_num)
by casenumber: gen arrestees = _N

* 9. Create perp_race indicator vars
foreach r in blk hsp wht oth unk {
    gen perp_race_`r' = (perp_race == "`r'")
}
drop perp_race arrestdate arrestdate_num

* 10. Collapse to one record per case
collapse (max) perp_race_*, by(casenumber arrestdate_first arrestdate_last arrestees)

tempfile arrests
save `arrests', replace


**************************************************
* COMBINE CRIMES & SUSPECT RACE
**************************************************

use `allcrime', clear
merge m:1 casenumber using `arrests'
drop if _merge == 2

* 11. Mark missing arrests as unknown
replace perp_race_unk = 1 if _merge == 1
foreach v of varlist perp_race_blk perp_race_hsp perp_race_wht perp_race_oth {
    replace `v' = 0 if _merge == 1
}
drop _merge

* 12. Assign unique ID and clean coords
sort date latitude longitude
gen unique_crime_id = _n
rename latitude  crime_lat
rename longitude crime_lon
format crime_lat crime_lon %10.8g
drop if missing(crime_lat, crime_lon)

* 13. Create unique lat‑lon grouping
egen crime_unique_lat_lon = group(crime_lat crime_lon)

save "violent-crimes-linked-to-arrests-2011-2020.dta", replace



**************************************************
* EXPORT UNIQUE LAT–LON PAIRS FOR OFFICERS & CRIMES
**************************************************

* 1. Officer addresses (2012–2020)
use "masterfile.dta", clear
keep if add_has_address == 1
keep add_unique_lat_lon add_lat add_lon
duplicates drop
export delimited "unique-lat-long-officer-addresses-2012-2020.csv", replace

* 2. Violent crime locations (2011–2020)
use "violent-crimes-linked-to-arrests-2011-2020.dta", clear
keep crime_unique_lat_lon crime_lat crime_lon
duplicates drop
export delimited "unique-lat-long-violent-crimes-2011-2020.csv", replace


**************************************************
* SPATIAL JOIN IN ARCGIS
**************************************************

* In ArcGIS:
*   • Join "unique-lat-long-violent-crimes-2011-2020.csv" to:
*       – 2010 Block Groups
*       – 2023 Community Areas
*       – PHDCN Neighborhood Clusters
*   • Export as "unique-lat-long-violent-crimes-2011-2020-blockgroup-ca-nc.csv"


**************************************************
* IMPORT ENRICHED CRIME FILE & MERGE BACK
**************************************************

import delimited "unique-lat-long-violent-crimes-2011-2020-blockgroup-ca-nc.csv", clear
keep crime_unique_lat_lon crime_lat crime_lon area_numbe nc_num geo_id

* Extract blockgroup from geo_id (e.g., "1500000US170310101001")
split geo_id, parse("1500000US")
rename geo_id2 blockgroup
rename area_numbe ca_num
drop geo_id geo_id1

* Merge geographic fields back onto crime data
merge 1:1 crime_unique_lat_lon using "violent-crimes-linked-to-arrests-2011-2020.dta"
drop _merge

* Recreate date_num from string date
gen date_num = date(substr(date, 1, 10), "MDY")
format date_num %td

* Rename coords and reorder variables
rename crime_lat lat
rename crime_lon lon

order unique_crime_id crime_unique_lat_lon casenumber date date_num year crimetype ///
      lat lon blockgroup nc_num ca_num vict_sex vict_race arrestdate_first arrestdate_last arrestees perp_race_*

* Prefix all key fields with "crime_"
foreach var in lat lon blockgroup nc_num ca_num casenumber date date_num year crimetype vict_sex vict_race arrestdate_first arrestdate_last arrestees perp_race_* {
    rename `var' crime_`var'
}

* Variable labels
lab var crime_lat "Latitude of crime"
lab var crime_lon "Longitude of crime"
lab var crime_blockgroup "Block group of crime"
lab var crime_nc_num "NC of crime"
lab var crime_ca_num "CA of crime"

lab var crime_casenumber "Crime CPD case number"
lab var crime_date "Crime date"
lab var crime_date_num "Crime date"
lab var crime_year "Crime year"
lab var crime_crimetype "Crime type"
lab var crime_vict_sex "Crime victim sex"
lab var crime_vict_race "Crime victim race"
lab var crime_arrestdate_first "Date of first arrest connected to the crime"
lab var crime_arrestdate_last "Date of last arrest connected to the crime"
lab var crime_arrestees "Number of people arrested connected to the crime (as of Feb 23 2024)"
lab var crime_perp_race_blk "Any black arrests connected to the crime (as of Feb 23 2024)"
lab var crime_perp_race_hsp "Any hispanic arrests connected to the crime (as of Feb 23 2024)"
lab var crime_perp_race_wht "Any white arrests connected to the crime (as of Feb 23 2024)"
lab var crime_perp_race_oth "Any other arrests connected to the crime (as of Feb 23 2024)"
lab var crime_perp_race_unk "Any unknown race arrests connected to the crime (as of Feb 23 2024)"

compress
save "violent-crimes-linked-to-arrests-2011-2020-blockgroup-ca-nc.dta", replace

* Also save subset for shootings and homicides
keep if inlist(crime_crimetype, "SHOOTING", "HOMICIDE")
save "homicides-shootings-linked-to-arrests-2011-2020-blockgroup-ca-nc.dta", replace

